#!/bin/bash
#
# outbreak2 : 
# a (hopefully) more robust command to copy the source
# disk to clients.
#
# usage:
#   ./outbreak2 [options] src_file dst_file
#
# try:
#    ./outbreak2 -h (or --help)
# and
#    ./outbreak2 --long_help
# for more details.
#
# locate this script, so that companion files (ping.mk,
# checkpoint.py) can be found next to it.
#
# BASH_SOURCE[0] is the path name of this file as bash was
# given it. it is used instead of $_ because $_ is taken
# from the environment at startup, so it can name the
# interpreter rather than this script (e.g., when invoked
# as "bash outbreak2").
#
# results:
#   this_file     : path name of this script as invoked
#   cur_dir       : directory from which it was invoked
#   this_file_abs : absolute path name of this script
#   this_dir      : directory containing this script
this_file=${BASH_SOURCE[0]}
cur_dir=$(pwd)
if [ "${this_file:0:1}" = "/" ]; then
    # this_file is an absolute path name
    this_file_abs=${this_file}
else
    # this_file is a relative path name
    this_file_abs=${cur_dir}/${this_file}
fi
# quoted, so that a directory name with spaces survives
this_dir=$(dirname "${this_file_abs}")

# default values of the command line options.
# each of them can be overridden by the option of the
# same name (see parse_args).
#
# never_quit_above:
# if we have this many clients, it never quits, no
# matter what fraction of the clients has
# failed (for example, if we set this to 5, then
# the system never quits as long as you have 5
# clients).  use this value if you have a specific
# requirement about the number of clients you
# make and do not care about the success RATIO (e.g.,
# you start from 30 clients and do not care even
# if 25 failed and you get only 5).
# by default, set to a very large value
# so that it essentially has no effect.  whether
# the system keeps going or bails out is then determined
# by the RELATIVE progress and failure RATE.
# (see check_continue below)
never_quit_above=100

#
# resume=1 : resume from the previous run using the
# checkpoint file. by default, we don't use it (start
# from the beginning each time).
# ignore_timestamp=1 : when resuming, look up the
# checkpoint file without the modification time
# (see get_offset_from_checkpoint).
#
resume=0
ignore_timestamp=0

#
# verify after writes (not implemented yet)
#
verify=0

#
# this command repeatedly invokes blkcpc to copy a
# block of data.  blk_sz specifies the size of
# data copied per invocation of blkcpc (128MB).
# pkt_sz is passed to blkcpc as --pkt_sz (32KB).
#
blk_sz=$((128 * 2 ** 20))
pkt_sz=$((32 * 1024))

#
# timeout for transferring blk_sz bytes. 0 means
# it is calculated from blk_sz and a "normal" disk
# write speed (plus some margin); see parse_args.
# leave it at 0 unless you know what it means.
#
timeout=0

#
# "local" for the local test mode, "production" otherwise.
# in the local test mode, this command
# will do everything locally. it copies test_file_src
# in the current directory into test_file_dst of the
# clients (they move to their own temporary directories
# so it's safe for all of them to copy into test_file_dst).
#
#mode=local
mode=production

#
# disk_check : when set by --disk_check option, do not
# transfer data and write junk data to disks.
# net_check : when set by --net_check option, copy_block
# turns off writing (do_write=0).
#
disk_check=0
net_check=0

#
# number of clients when mode=local
#
n_local_clients=4

#
# verbosity level
#
verbosity=1

#
# wallpaper is a misnomer. if set to 1,
# a window pops up showing the status
# of each client
#
wallpaper=1

#
# fault injection probability.
# they are solely for debugging fault tolerant features.
# when read_fault>0.0, each read system call performed by
# blkcpc command fails with this probability. similarly for
# send, recv, write, close, closesocket.  the behavior of
# the injected fault is determined by block_on_fault.
# when this is set, the process blocks (and does not exit)
# forever, until it is killed. this command handles this
# type of fault by giving --timeout option to gxpc that
# invokes blkcpc. when block_on_fault is not set, then
# the process that experienced an injected fault immediately
# exits with status 1.
#
read_fault=0.0
send_fault=0.0
recv_fault=0.0
write_fault=0.0
close_fault=0.0
closesocket_fault=0.0
block_on_fault=1

#
# run=0 for dryrun (-n, --dryrun)
# ask=1 to ask the user if it should go ahead
# (-f, --force sets it to 0)
#
run=1
ask=1

#
# help=1 when -h/--help is given,
# long_help=1 when --long_help is given
#
help=0
long_help=0

#
# brief usage.
# prints the list of options to stdout. the value in
# brackets is the current value of the corresponding
# variable (the default, unless it has been overridden
# before usage is called).
# every option accepted on the command line is listed,
# including --n_local_clients.
#
usage() {
    cat <<EOF
USAGE:
  $0 [options] [src] [dst]

OPTIONS:
  -h,--help
  --long_help
  -n,--dryrun
  -f,--force
  --resume
  --ignore_timestamp
  --verify
  --disk_check
  --net_check
  --no_wallpaper
  --mode local/production [${mode}]
  --n_local_clients N [${n_local_clients}]
  --verbosity 0-4 [${verbosity}]
  --blk_sz N [${blk_sz}]
  --pkt_sz N [${pkt_sz}]
  --timeout S [${timeout}]
  --never_quit_above [${never_quit_above}]
  --read_fault p [${read_fault}]
  --send_fault p [${send_fault}]
  --recv_fault p [${recv_fault}]
  --write_fault p [${write_fault}]
  --close_fault p [${close_fault}]
  --closesocket_fault p [${closesocket_fault}]
  --block_on_fault 0/1 [${block_on_fault}]
EOF
}

#
# long usage (still under construction).
# prints the manual to stdout with a here-document.
# the delimiter is not quoted, so ${this_dir} in the text
# is expanded when the manual is printed; do not put
# other $ or backquote characters into the text.
#
long_usage() {
    cat <<EOF

DESCRIPTION: 

  outbreak2 copies SRC on the local machine
  into DST on client machines.  See EXAMPLES
  section for some examples.  Without any
  option, it will copy /dev/sda of the local
  machine into /dev/sda of all client
  machines.  Client machines are listed in
  ping.mk in the same directory of outbreak2
  (${this_dir}).  By default, it is all
  machines having IP addresses
  10.0.3.100-10.0.3.199 and responding to
  ping.  In any case, it briefly shows what
  is going to happen (which file is copied to
  which clients) and asks you if it can go
  ahead.  -f option forces it to go ahead
  without asking.

THE BASIC BEHAVIOR: 

  Since this command is primarily intended to
  copy an entire drive typically hundreds of
  giga bytes large, it has provisions for
  gracefully handling faults.

  (i) When a client experiences a fault,
  other healthy clients can continue;
  outbreak2 actually judges if it should
  continue with remaining clients or stop the
  entire process (bail out). When an error
  occurs, you can hopefully just sit down and
  keep watching until it finishes with the
  remaining clients or bails out.

  The criteria used to decide if it continues
  or bails out is that it bails out when so
  many clients failed so early.  The basic
  idea behind this is when it happens,
  restarting from the beginning won't lose
  too much time.  You also do not want to
  keep waiting for the remaining clients to
  finish when the harvest you can expect is
  already low.  More specifically, it
  continues when the following is satisfied.

   fraction of remaining clients
     + fraction of already copied data > 0.9

  (ii) When the entire process bails out, you
  can still resume from the point where the
  first fault happens, with --resume and
  --ignore_timestamp option (see the section
  "BEHAVIOR OF --RESUME OPTION" below).  To
  this end, outbreak2 is writing the progress
  to a checkpoint file (ckpt.db) in the
  current directory.  When --resume option is
  given, it reads the checkpoint file to
  determine where it should start again.

  During the operation, a window showing the
  status will pop up on the desktop of each
  client.  It shows hostname, IP address,
  working directory, a globally unique ID
  (that can be matched with text messages on
  the local terminal), and the status
  (started, succeeded, failed, bailed out).
  As soon as a client drops out due to a
  fault, the window indicating the failure
  will show up on the client.

  On the local terminal, various progress and
  event messages will be shown, including how
  much progress has been made and the number
  of clients still engaged.

A DETAILED BEHAVIOR UPON A FAILURE: 

  It copies the source drive (or a file, for
  that matter) by dividing the drive into a
  number of blocks.  A single block is read,
  transferred, and written at a time.  A
  single block is 128MB by default
  (changeable by --blk_sz option).  Progress
  is managed and failures handled on a
  block-by-block basis.

  When copying a block to all clients fails
  for any reason, then it tries to detect and
  remove suspicious clients.  First, it
  detects any client that is unreachable (by
  means of gxpc ping command) and remove
  them.  Second, it lets reachable clients
  write some random data to the range of the
  destination file that have just experienced
  a failure, to detect clients that suffer
  from broken disks.  Such clients are
  removed.  Then it retries to copy the same
  block to still remaining clients.  When it
  fails again, it could repeat the above
  ping--random writes--copy cycle any number
  of times.  Any client that has failed in
  the above process (either because it did
  not respond to ping or failed to complete
  the random write) won't participate in the
  rest of the process.

BEHAVIOR OF --RESUME OPTION:

  Each time you run outbreak2, it will copy
  the entire drive from the beginning.  If
  you want to start from the point where the
  first failure occurs in the last
  invocation, you can give --resume option,
  possibly with --ignore_timestamp.  But you
  better understand how it works before you
  use it.  Carelessly using it might end up
  with a partially copied disk.

  If you are impatient, remember the
  following.

  (i) outbreak2 will show the offset it would
  start from and ask you if it should go
  ahead.  See the value to see if it makes
  sense to you (i.e., where the first fault
  occurred during the last run).

  (ii) --resume --ignore_timestamp is safe
  ONLY WHEN YOU DO NOT CONNECT NEW CLIENTS,
  RECONNECT/REBOOT FAILED CLIENTS, NOR WRITE
  TO THE FAILED DISK.  In short, use
  --ignore_timestamp only when you leave
  failed machines as they are (don't reboot them,
  don't modify their disks) and rerun outbreak2
  again from that state.

  (iii) --resume (without --ignore_timestamp)
  SHOULD BE always safe, but in my
  experiences so far, is overly conservative
  for reasons I do not yet understand; there
  are situations where it misses an
  opportunity to skip blocks.  It happens
  that it says it would start from the offset
  0 (beginning), even though all clients have
  progressed in the last invocation.  So at
  this point the choice between with and
  without --ignore_timestamp is on you.
  Without it, you might have to lose all the
  work you have done.  With
  --ignore_timestamp, you must understand
  when it is safe and guarantee that it is.

  Here is how --resume and --ignore_timestamp
  work.

  Copying the entire drive consists of
  copying blocks.  All clients are engaged in
  copying the same block at the same time.
  Whenever a single block has been written to
  all working clients, outbreak2 will record
  that fact to a checkpoint file
  (ckpt.db). It is a simple human-readable
  text file and its each record looks like:

    addrs|path|mtime|size

  ADDRS column is the list of IP addresses of
  the client, as obtained by hostname -I
  command (there may be multiple IP
  addresses).  PATH is the canonical absolute
  path name of the destination file, as
  obtained by os.path.realpath method of
  python module.  MTIME is the modification
  time of the destination file as obtained by
  os.stat(..).st_mtime.  SIZE is the number
  of bytes that have been successfully
  written.

  Whenever a single block has been written to
  all clients, outbreak2 will update the
  checkpoint file.  In this process, the pair
  (addrs, path) is used as a key.  MTIME is
  the modification time of the destination
  file as reported by the client.  To
  summarize, each record indicates how far
  each client (identified as the addresses
  reported by hostname -I) has gone, along
  with the modification time corresponding to
  the last block.

  When outbreak2 is executed with --resume
  option, it consults the checkpoint file and
  matches clients that participate this time
  with records in the checkpoint file.  For
  each client, if there is a matching record,
  whose definition is given below, it assumes
  the first SIZE bytes has been written to
  that client so can be skipped at this time.

  When it looks up the checkpoint file for a
  match, it uses the triple ADDRS, PATH,
  MTIME as the key if --ignore_timestamp is
  NOT given.  If it IS given, it uses the
  pair ADDRS, PATH as the key (ignoring the
  MTIME field).

  Note that we cannot assume IP ADDRS and
  PATH reliably identify the physical machine
  and physical drive; even if there is a
  record having a matching ADDRS and PATH
  fields, there is no guarantee that it is
  referring to the same physical drive of the
  same physical machine as the one we worked
  on the last time.  Even if it is indeed the
  same physical drive, it may have been
  written to (thus invalidated) for other
  reasons.  To detect such cases, we compare
  the MTIME field of the drive we are going
  to work and the MTIME in the checkpoint
  file.  If the triple matches, it will be
  safe in practically all cases, unless I am
  missing something.

  Ideally you never want to use
  --ignore_timestamp.  In my experiences,
  however, it happens that MTIME at the time
  of the second invocation is somehow changed
  from the value recorded in the checkpoint.
  I don't know how this happens, but until
  the issue is fully resolved, you may want
  to use --ignore_timestamp at your own risk.

  Finally, now you probably understand when
  --ignore_timestamp is safe.  It is safe
  when nothing happens that invalidates the
  binding between ADDRS and the physical
  machine or the binding between the PATH and
  the physical drive and the physical drive
  is not modified in any way.

DISK COPY VS. FILE COPY:

  Although its primary purpose is to copy an
  entire disk, which is a special (device)
  file, it can also be used to copy a regular
  file.  You may simply give regular file
  names as the source and destination files.
  e.g.,

  ./outbreak2 big_file_src big_file_dst

or

  ./outbreak2 big_file 

the latter of which is equivalent to 

  ./outbreak2 big_file big_file 

  This feature is mainly used for debugging
  and testing this program (you don't want to
  copy /dev/sda each time you test it, do
  you?).

  Care must be taken how to interpret the
  relative path name given as the destination
  file.  A unique directory is created for
  each client and it is used for the base
  path of relative path names; thus, the
  relative destination path becomes unique
  EVEN IF SOME CLIENTS RUN on the same
  machine.

LOCAL TEST MODE: 

  To facilitate debugging and testing, it has
  a --mode=local option.  When --mode=local
  is given, all clients run on the same
  machine.  See the description of --mode
  option below.


OPTIONS:

  --resume : resumes from the point a failure
    occurred in the last invocation, rather than
    starting from the beginning of the file. See 
    BEHAVIOR OF --RESUME OPTION above for details.

  --verify : (not implemented yet) will
    verify the specified source file on the
    local machine is identical to the
    destination files on the client machines.

  --no_wallpaper : does not show status windows on 
    clients.  

  --mode local/production : --mode local can be
    used to locally test this program. when --mode
    local is given, it uses local processes as 
    clients.  The number of clients can be specified
    by --n_local_clients.  When this option is used,
    you cannot omit the source file name.  Also, 
    you probably do not want to use an absolute path 
    name for destination file name, as it will refer
    to the same file among all clients.  Take advantage
    of the fact that relative path names are relative
    to a directory unique to each client.  So, for 
    example, 
      ./outbreak2 --mode local foo bar
    is safe (bar becomes unique to each client).

  --verbosity V : controls the verbosity level. 

  --blk_sz N : determines the unit of work.

    outbreak2 repeatedly invokes blkcpc
    command (of GXP) to copy the file.  This
    option specifies how much data will be
    transferred on each invocation of blkcpc
    command.  As each invocation of blkcpc
    will open, seek, read/write, and close
    the source/destination files, giving too
    small a blk_sz value will result in large
    overhead.  On the other hand, outbreak2
    will show the progress and checkpoint it,
    you do not want to make it too large.

  --timeout S : specifies the timeout value
    of each blkcpc command.  The default
    value (0) means it is determined from the
    blk_sz value.

  --never_quit_above N : specifies the number
    of clients above which outbreak2 will
    never bail out.  Normally, outbreak2 will
    stop the entire process when it judges
    so many clients have failed so early that
    restarting the copy from scratch (perhaps
    with new machines or after fixing a problem
    that makes faults so common) won't lose too much
    time.  By specifying this value to a small
    value, you can guarantee outbreak2 won't give
    up as long as that many clients still remain.

  --read_fault p :
  --send_fault p : 
  --recv_fault p :
  --write_fault p :
  --close_fault p :
  --closesocket_fault p :

    specify the fault injection probability
    of the respective operation in blkcpc
    command.  --read_fault 1.0e-2 means any
    read system call will fail with
    probability 1.0e-2.

  --block_on_fault 0/1 : 1 means upon an
    injected fault, the blkcpc process will
    indefinitely block until outbreak2 killed
    them.


FILES:

ckpt.db : outbreak2 will create this file to
   record how much data were successfully
   copied to each client.  It is a
   |-separated text file containing IP
   addresses, absolute path to the
   destination file, the modification time,
   and the offset to which the copy
   succeeded.  when --resume option is given,
   outbreak2 consults this file to skip data
   that can be surely omitted.

/tmp/USER/outbreak/N : a working directory of
   client N, so that each client has a unique
   working directory even if some are running
   on the same machine.  Remember that a
   relative pathname given as the destination
   file is relative to this directory.

status.svg : the status file drawn on the
   desktop of each client.  It is created in
   one of the following directories (whichever
   exists).
   /home/USER/Desktop
   /home/USER/デスクトップ
   /tmp/USER/outbreak/N

status.pid : a file containing the pid of the
   process showing the status.svg.  The file
   is used to kill it when the status of the
   client changes to the process should be
   gone.  It is created in the same directory
   as status.svg

EXAMPLES:
(0) ./outbreak2 -n

  shows you what is going to happen and exits
  without actually doing work. Always do this
  before you actually go ahead.

(1) ./outbreak2 

  copies /dev/sda of the local machine into
  /dev/sda of all ping-reachable machines
  having IP addresses 10.0.3.100-10.0.3.199.
  Make sure the drive you want to copy is
  /dev/sda.  The IP address range is
  hardwired in ping.mk in this directory.
  Modify ping.mk when you want to change this
  behavior.

(2) ./outbreak2 /dev/sdb

  ditto, but copies /dev/sdb instead of
  /dev/sda.  Use this when you know the drive
  you want to copy is /dev/sdb.

(3) ./outbreak2 /dev/sdb /dev/sda

  copies the /dev/sdb of the local machine to
  /dev/sda of client machines.

(4) ./outbreak2 --resume 

  resumes copying from the point where the
  first fault occurred in the last run.  See
  BEHAVIOR OF --RESUME OPTION above for
  details.

(5) ./outbreak2 --mode local foo bar

  copies FOO under the current directory to
  /tmp/USER/outbreak/{0,1,2}/bar.
  Non-existing directories are created.

EOF
}

#
# prepare for command line parsing.
#    eval set -- xx yy zz ..
# sets the command line of this script to xx yy zz ..
#
# getopt ... command prints the "canonicalized" command line,
# so that command line processing becomes streamlined.
# its output is shell-quoted, so it must reach eval as a
# single quoted string; otherwise arguments containing
# spaces would be split again before eval sees them.
#
# when getopt rejects the command line (unknown option,
# missing argument), it has already printed the reason to
# stderr. we stop right there rather than going ahead with
# a half-parsed command line.
#
outbreak_getopt_out=$(getopt -o "hnf" --long help,long_help,force,dryrun,resume,ignore_timestamp,verify,disk_check,net_check,no_wallpaper,blk_sz:,pkt_sz:,timeout:,never_quit_above:,mode:,n_local_clients:,verbosity:,read_fault:,send_fault:,recv_fault:,write_fault:,close_fault:,closesocket_fault:,block_on_fault: -- "$@")
if [ $? -ne 0 ]; then
    echo "outbreak: error: invalid command line (try -h or --help)" >&2
    exit 1
fi
eval set -- "${outbreak_getopt_out}"
unset outbreak_getopt_out

#
# parse_args ARGS ... : set the option variables from the
# canonicalized command line (options, then --, then the
# optional source and destination file names).
#
# besides the option variables, it sets:
#   timeout  : when it is 0, blk_sz / 8MB + 10 (seconds)
#   src_file : $1 after --, or /dev/sda in production mode
#   dst_file : $2 after --, or src_file
#   creat, trunc : 0 if src_file is a block device, 1 otherwise
#   ckpt_db  : name of the checkpoint file (ckpt.db)
#
# returns 0 on success. returns 1, after printing an error
# message, for an unknown option, an option lacking its
# value, or a missing source file name in local mode.
#
parse_args() {
    # the command line has been canonicalized, so we simply
    # loop over $1 $2 $3 $4 ...
    while true; do
        case "$1" in
            -h|--help)
                help=1
                shift 1
                ;;
            --long_help)
                long_help=1
                shift 1
                ;;
            -n|--dryrun)
                run=0
                shift 1
                ;;
            -f|--force)
                ask=0
                shift 1
                ;;
            --resume)
                resume=1
                shift 1
                ;;
            --ignore_timestamp)
                ignore_timestamp=1
                shift 1
                ;;
            --verify)
                verify=1
                shift 1
                ;;
            --disk_check)
                disk_check=1
                shift 1
                ;;
            --net_check)
                net_check=1
                shift 1
                ;;
            --no_wallpaper)
                wallpaper=0
                shift 1
                ;;
            --blk_sz|--pkt_sz|--timeout|--never_quit_above|--mode|--n_local_clients|--verbosity|--read_fault|--send_fault|--recv_fault|--write_fault|--close_fault|--closesocket_fault|--block_on_fault)
                # options taking a value. each of them is stored
                # in the variable of the same name (sans "--").
                if [ $# -lt 2 ]; then
                    # shift 2 would fail without shifting anything
                    # and we would loop forever on the same option
                    echo "outbreak: error: option $1 requires an argument"
                    return 1
                fi
                printf -v "${1#--}" '%s' "$2"
                shift 2
                ;;
            --)
                shift 1
                break
                ;;
            *)
                echo "outbreak: error: unknown option ($1)"
                return 1
                ;;
        esac
    done

    # set the default timeout value.
    # assume a conservative estimate of the
    # (disk) bandwidth: 8MB.
    # plus 10 seconds margin.
    #
    if [ "${timeout}" = "0" ]; then
        # 8MB + 10 sec margin
        bw=$((8 * 1024 * 1024))
        timeout=$((${blk_sz} / ${bw} + 10))
    fi

    # determine source and destination file.
    # only the production mode has a default source.
    default_src=""
    if [ "${mode}" = "production" ]; then
        default_src=/dev/sda
    fi
    src_file=${1:-${default_src}}
    dst_file=${2:-${src_file}}
    if [ "${src_file}" = "" ]; then
        echo "outbreak: error: you should supply the source file when --mode=local"
        return 1
    fi
    # quoted, so that a name containing spaces is tested
    # as a single file name
    if [ -b "${src_file}" ]; then
        # device file -> do not creat/trunc the destination file
        creat=0
        trunc=0
    else
        creat=1
        trunc=1
        #trunc=0
    fi

    # make src_file absolute, so that client 0 can find
    # it after changing to his temp dir.
    # ??? or should we avoid changing dir on client 0?

    # check point file that tracks how many bytes have been
    # successfully copied
    ckpt_db="ckpt.db"
    return 0
}

#
# print_args : print the current settings, one
# NAME=VALUE line per variable, to stdout.
#
print_args() {
    local name
    for name in \
        resume ignore_timestamp verify creat trunc \
        disk_check net_check wallpaper \
        blk_sz pkt_sz timeout never_quit_above \
        mode n_local_clients verbosity \
        read_fault send_fault recv_fault write_fault \
        close_fault closesocket_fault block_on_fault \
        src_file dst_file
    do
        # ${!name} is the value of the variable whose name is in ${name}
        printf '%s=%s\n' "${name}" "${!name}"
    done
}

#
# get_file_size FILE prints the size of FILE in bytes.
# open the file and seek the end of the file, so that
# it works for a device file as well as a regular file.
# I don't know if there is a convenient command
# for it; I resorted to python.
#
# the python code runs under both python 2 and 3.
# "python" is used if it exists, "python3" otherwise.
#
# returns 1, with a message on stderr, if FILE does not
# exist or no python is found. the message goes to stderr
# so that it never gets mixed into the size a caller captures.
#

get_file_size() {
    if [ ! -e "$1" ]; then
        echo "error: file \"$1\" does not exist." >&2
        return 1
    fi
    local py
    py=$(command -v python || command -v python3) || {
        echo "error: python is required to get the size of \"$1\"." >&2
        return 1
    }
    "${py}" -c 'import os, sys; print(os.lseek(os.open(sys.argv[1], os.O_RDONLY), 0, os.SEEK_END))' "$1"
}

#
# get clients that respond to ping, to know which
# clients we try to explore. (it used to explore
# all of them and get whichever clients we successfully
# explore, but this is too crude and looks ugly).
# we ping all clients in parallel (make -j).
# the trick is in ping.mk file in the directory of
# this script (this_dir).
#
# sets ping_mk to the path name of ping.mk. prints
# whatever make prints for ping.mk.
#

get_pingable_clients() {
    ping_mk=${this_dir}/ping.mk
    # quoted, so that it works when this_dir contains spaces
    make -s -f "${ping_mk}" -j
}

#
# print the clients to explore.
#  production run : whatever get_pingable_clients prints
#  local test     : "client N", N being n_local_clients
#
get_clients() {
    case "${mode}" in
        production)
            get_pingable_clients
            ;;
        *)
            echo "client ${n_local_clients}"
            ;;
    esac
}

#
# explore clients. make a temporary directory under
# /tmp/USER/outbreak/N, where USER is a user name and N is
# a unique index number (0,1,2, ...), and set their
# current directory there.
#
# the purpose of this is mainly to make this tool
# testable within a single node. by having them work
# on a separate directory, we can make the destination
# file unique for each client.
#
# variables read : mode, verbosity, src_gupid, src_file,
#   dst_file, USER.  src_gupid is not set in this function;
#   it has to be set before this function is called.
# variables set : rsh_method, clients, and (only when
#   verbosity >= 1) n_clients.
#
# NOTE(review): judging from check_continue, which counts
# the source with "-g ${src_gupid}" and the clients with
# "-G ${src_gupid}", -g presumably selects the source
# process and -G everybody else -- confirm with gxpc.
#

bring_up_clients_and_change_dir() {
    # reach the clients by ssh in a production run, and
    # by sh (local processes) in the local test mode
    if [ "${mode}" = "production" ] ; then
	rsh_method=ssh
    else
	rsh_method=sh
    fi
    clients="$(get_clients)"
    if [ "${verbosity}" -ge 2 ] ; then
	echo "outbreak: clients: ${clients}"
    fi
    gxpc use ${rsh_method} $(hostname) .
    # ${clients} is deliberately not quoted; it is split
    # into separate arguments of gxpc explore
    gxpc explore --timeout 15.0 --children_soft_limit 200 ${clients}

    # \${GXP_EXEC_IDX} is escaped, so that it is not expanded
    # by this shell but passed literally to gxpc
    gxpc -G ${src_gupid} export "OUTBREAK_CLIENT_IDX=\${GXP_EXEC_IDX}"

    gxpc -G ${src_gupid} e mkdir -p "/tmp/${USER}/outbreak/\${GXP_EXEC_IDX}"
    gxpc -G ${src_gupid} cd         "/tmp/${USER}/outbreak/\${GXP_EXEC_IDX}"
    if [ "${verbosity}" -ge 1 ] ; then
	# show who copies which file (id, working directory,
	# file name), first for -g and then for -G
	echo "outbreak: source:"
	gxpc -g ${src_gupid} e echo "\"  \"\${GXP_GUPID} \$(pwd) ${src_file}"
	# n_clients is the number of lines gxpc ping prints
	n_clients=$(gxpc -G ${src_gupid} ping | wc -l)
	echo "outbreak: ${n_clients} clients:"
	gxpc -G ${src_gupid} e echo "\"  \"\${GXP_GUPID} \$(pwd) ${dst_file}"
    fi
}

#
# print the offset from which we should start copying,
# according to the checkpoint file ${ckpt_db}.
#
# variables read : ignore_timestamp, src_gupid, dst_file,
#   this_dir, ckpt_db.
# variables set : op (the operation given to checkpoint.py).
#
# the command is traced with set -x while it runs;
# tracing is turned off again before returning.
#

get_offset_from_checkpoint() {
    # sorry for the ugly command ...
    # for each client, obtain the |-separated triple 
    #    ADDRS|PATH|MTIME
    # and feed them to checkpoint.py program.
    # ADDRS is the output of hostname -I, PATH is
    # os.path.realpath of the destination file, and MTIME
    # is its st_mtime. a client whose destination file does
    # not exist produces no triple (an empty line).
    #
    # the following is the original author's description of
    # checkpoint.py, which is not part of this file:
    # checkpoint.py min_offset ckpt.db prints a single
    # number indicating the offset to start copying 
    # from. For each input triple, it looks
    # it up in the database (ckpt.db).  If any triple is not found,
    # it immediately prints 0 and exits.  If all triples
    # are found, it prints the minimum of the values 
    # found in SIZE column of the database.
    #
    # with --ignore_timestamp, min_offset_ignore_ts is
    # used instead of min_offset.
    if [ "${ignore_timestamp}" = 1 ]; then
	op=min_offset_ignore_ts
    else
	op=min_offset
    fi
    set -x
    gxpc -G ${src_gupid} e "echo \$(python -c 'import os,sys,stat; ip=sys.argv[1]; f=sys.argv[2]; os.path.exists(f) and sys.stdout.write(\"%s|%s|%f\" % (ip, os.path.realpath(f), os.stat(f).st_mtime))' \"\$(hostname -I)\" ${dst_file})" | ${this_dir}/checkpoint.py ${op} ${ckpt_db}
    set +x
}

#
# set start_offset variable to where we should start this time.
# (1) unless --resume is given, we start from the beginning
# (2) if --resume is given, ask get_offset_from_checkpoint.
#     if it prints nothing (e.g., the lookup itself failed),
#     we start from the beginning, which is always safe,
#     instead of passing an empty offset around.
# in either case, the offset is exported as outbreak_offset
# with gxpc (-G ${src_gupid}).
#

set_start_offset() {
    if [ "${resume}" = "0" ] ; then
        if [ "${verbosity}" -ge 1 ] ; then
            echo "outbreak: --resume not given. ignore any checkpoint file"
        fi
        start_offset=0
    else
        if [ "${verbosity}" -ge 1 ] ; then
            echo "outbreak: --resume is given,. use checkpoint file ${ckpt_db}"
        fi
        if [ "${verbosity}" -ge 2 ] ; then
            cat "${ckpt_db}"
        fi
        start_offset=$(get_offset_from_checkpoint)
        if [ -z "${start_offset}" ] ; then
            echo "outbreak: warning: no offset obtained from ${ckpt_db}. start from 0" >&2
            start_offset=0
        fi
    fi
    gxpc -G ${src_gupid} export outbreak_offset=${start_offset}
}


# check_continue SZ_DONE SZ_TOTAL N_CLIENTS
#
# decide whether we should keep going despite that some
# clients have failed (return 0), or bail out (return 1,
# after printing the reason).
#   SZ_DONE   : bytes that we have copied
#   SZ_TOTAL  : bytes we need to copy in total
#   N_CLIENTS : the original number of clients
# the arguments are stored into sz_done, sz_total and
# n_clients. ns and nc are set to the number of lines
# that gxpc ping prints for -g and -G ${src_gupid},
# i.e., sources (0 or 1) and clients still alive.
#
# we keep going when
# (1) the source has not failed, and
# (2) at least one client is left, and
# (3) either never_quit_above or more clients are left, or
#     "enough" clients are left compared to the progress
#     we have made so far.  if a failure occurs pretty
#     early, it is likely to be good to quit, as we do not
#     lose too much by restarting from the beginning.  if
#     we have copied 90% of the file, we probably should
#     keep going, even if only 30% of the clients are left.

check_continue() {
    sz_done=$1
    sz_total=$2
    n_clients=$3

    ns=$(gxpc -g ${src_gupid} ping | wc -l)
    nc=$(gxpc -G ${src_gupid} ping | wc -l)

    # progress report. prog is in 1/100 of a percent;
    # px and py are its integer and fraction parts
    if [ "${verbosity}" -ge 1 ]; then
        prog=$((${sz_done} * 10000 / ${sz_total}))
        px=$((${prog} / 100))
        printf -v py "%02d" $((${prog} % 100))
        printf '%s\n' "outbreak: ${ns} source, ${nc}/${n_clients} clients, ${sz_done}/${sz_total} (${px}.${py}% done)"
    fi

    # if the source has gone, no way to proceed
    if ((ns < 1)); then
        echo "outbreak: the source failed. bail out."
        return 1
    fi
    # no clients left
    if ((nc < 1)); then
        echo "outbreak: no clients left. bail out."
        return 1
    fi

    # with this many clients left, we never give up
    if [ ${nc} -ge ${never_quit_above} ]; then
        return 0
    fi

    # otherwise, we continue if :
    #
    #  live clients      bytes we have copied
    # ---------------- + ---------------------- > 0.9
    #  total clients        bytes in total
    #
    # both sides are multiplied by (total clients * bytes
    # in total), so that integers suffice.
    local lhs rhs
    lhs=$((${sz_done} * ${n_clients} + ${nc} * ${sz_total}))
    rhs=$((${sz_total} * ${n_clients} * 9 / 10))
    if [ "${lhs}" -gt "${rhs}" ]; then
        return 0			# OK, go ahead
    fi
    echo "outbreak: too few clients are alive. bail out."
    return 1
}

# copy change_status to everybody as a preparation.
# ${cur_dir}/change_status (cur_dir is the directory from
# which this script was invoked) is copied to ./cs, a path
# relative to the current directory of each client, with
# gxpc mw blkcpm/blkcpc.
#
# sets bcpv (the verbosity given to blkcpc), n_clients
# (lines printed by gxpc ping), n_copy_ok (output lines
# containing DONE) and n_failed (their difference).
# returns 0 if n_failed is 0, and 1 after printing a
# message otherwise.
copy_stuff() {
    if [ "${verbosity}" -ge 1 ]; then
        echo "outbreak: copy change_status to clients with mw blkcpc"
    fi
    # blkcpc is one level less verbose than we are
    bcpv=$((${verbosity} - 1))
    # cp ${cur_dir}/change_status /tmp/
    n_clients=$(gxpc -G ${src_gupid} ping | wc -l)
    n_copy_ok=$(
        gxpc mw --master blkcpm \
            blkcpc --verbosity ${bcpv} --creat 1 --trunc 1 \
            ${cur_dir}/change_status ./cs \
            | grep DONE | wc -l
    )
    if [ "${verbosity}" -ge 2 ]; then
        echo "copied to ${n_copy_ok} out of ${n_clients} clients"
    fi
    n_failed=$((${n_clients} - ${n_copy_ok}))
    if [ "${n_failed}" != 0 ]; then
        echo "failed to copy to some clients n_clients=${n_clients}, n_copy_ok=${n_copy_ok}"
        return 1
    fi
    return 0
}

#
# copy_block SRC_FILE DST_FILE A B
#
# a fault tolerant :-) function to copy a block from the
# source to all clients. it uses blkcpm/blkcpc commands,
# now part of gxp3, to fetch a block and copy it to clients.
# we use time out (--timeout) of gxpc command, to continue
# despite failures (very slow disks, network disconnect, etc.). 
# after the command finishes, it examines if any client failed.
# if there is one, it examines reachability and disk health.
#
# arguments:
#   $1 : source file, $2 : destination file
#   $3, $4 : two offsets. the range actually given to blkcpc is
#            (sz - $4):(sz - $3), i.e. it is measured back from
#            the global ${sz}.
# globals read: sz, disk_check, net_check, timeout, pkt_sz, creat,
#   trunc, the *_fault settings, block_on_fault, verbosity,
#   src_gupid, this_dir, ckpt_db and b.
# globals written (nothing here is local): src_file, dst_file,
#   from_offs, to_offs, nc, do_trans, do_write, max_retries, i,
#   n_ping_ok, n_disk_ok, n_failed, n_copy_ok.
# there is no explicit return; the status is that of the last
# gxpc command.
#

copy_block() {
    src_file=$1
    dst_file=$2
    #from_offs=$3			# from offset
    #to_offs=$4			# to offset
    # NOTE(review): presumably this mirroring makes the copy proceed
    # from the end of the file as the caller's offsets grow -- confirm.
    from_offs=$((${sz} - $4))			# from offset
    to_offs=$((${sz} - $3))			# to offset

    # remember the set of clients still engaged.
    # this is necessary to change status on the
    # failed clients
    nc=$(gxpc --timeout 3.0 ping | wc -l)
    gxpc savemask before

    # disk_check=1 turns the transfer off (--transfer 0 below)
    if [ "${disk_check}" = 1 ] ; then
	do_trans=0
    else
	do_trans=1
    fi
    # net_check=1 turns the write off (--write 0 below)
    if [ "${net_check}" = 1 ] ; then
	do_write=0
    else
	do_write=1
    fi

    # managing clients in the presence of 
    # network/disk failures.
    # the basic command we use to broadcast
    # a block is
    #  mw --master blkcpm blkcpc ... ...
    # when some clients fail to accomplish
    # the above task, we want to detect and 
    # remove broken clients from further 
    # participation.
    #
    # a basic problem is that, since clients
    # collaborate together to broadcast a block,
    # if there is ANY broken client, nobody will
    # succeed. in other words, the status of the 
    # above command is almost all-or-nothing;
    # either everybody succeeds or nobody does.
    # the latter does NOT mean all clients are 
    # broken. to correctly identify broken clients,
    # we need to run a command that can independently
    # succeed yet can accurately predict the health
    # of each.  to this end, we do the following.
    #
    # ping ; drop unresponsive clients ;
    # copy ; if all responsive clients succeed, done ;
    # repeat:
    #  ping ; drop unresponsive clients again ; 
    #  disk_check ; each client independently writes some junks to the disk ;
    #    drop clients that failed to finish within a time out ;
    #  copy ; if all responsive clients succeed, done ;

    max_retries=1		# we try (max_retries + 1) times
    for i in $(seq 0 ${max_retries}); do
	if [ $i -gt 0 ] ; then
	    echo "outbreak: retry with reachable processes (try $i, max tries = ${max_retries})."
	    echo "outbreak: ping clients"
	fi
	# ping with timeout 3 sec
	n_ping_ok=$(gxpc --timeout 3.0 ping | wc -l)
	if [ "${n_ping_ok}" -lt "${nc}" ] ; then
	    echo "outbreak: ****** only ${n_ping_ok} process(es) responded (out of ${nc})"
	elif [ $i -gt 0 ] ; then
	    echo "outbreak: ${n_ping_ok} process(es) (source or clients) responded"
	fi
	# nobody answered; give up. note that we still fall through
	# to the checkpoint/status handling after the loop.
	if [ "${n_ping_ok}" = "0" ]; then break ; fi
	# filter out unreachable guys
	gxpc smask
	if [ $i = 0 ] ; then
	    # no disk check in the first iteration
	    # (presumably the one subtracted is the source process)
	    n_disk_ok=$((${n_ping_ok} - 1))
	else
	    echo "outbreak: ****** check disk health of clients ... "
	    # same blkcpc command as the real copy below, but with
	    # --write 1 --transfer 0 fixed. we count the output
	    # lines containing DONE.
	    n_disk_ok=$(gxpc --timeout ${timeout} mw --master blkcpm "blkcpc --pkt_sz ${pkt_sz} --creat ${creat} --trunc ${trunc} --read_fault ${read_fault} --send_fault ${send_fault} --recv_fault ${recv_fault} --write_fault ${write_fault} --close_fault ${close_fault} --closesocket_fault ${closesocket_fault} --block_on_fault ${block_on_fault} --min_receivers 0 --write 1 --transfer 0 --range ${from_offs}:${to_offs} ${src_file} ${dst_file}" | grep DONE | wc -l)

	    n_failed=$((${n_ping_ok} - ${n_disk_ok} - 1))
	    if [ "${n_failed}" -gt "0" ]; then
		echo "outbreak: ****** only ${n_disk_ok} processes have a working disk (out of $((${n_ping_ok} - 1)))"
		echo "outbreak: ${dst_file} has suspicious sectors in ${from_offs}:${to_offs}"
	    else 
		echo "outbreak: all ${n_disk_ok} process(es) appear to have a working disk"
	    fi
	    if [ "${n_disk_ok}" = "0" ]; then break ; fi
	    # filter out those who failed the disk check
	    gxpc smask
	fi
	# now copy a block, with the gxpc timeout set to ${timeout}.
	# we count the output lines containing DONE, presumably one
	# per process that finished (TODO: should be made more robust)
	if [ "${verbosity}" -ge 1 ]; then
	    if [ $i = 0 ] ; then
		echo "outbreak: copy range ${from_offs}:${to_offs} ..."
	    else 
		echo "outbreak: ****** retry to copy range ${from_offs}:${to_offs} among clients with working disks ..."
	    fi
	fi
	# trace the copy command when very verbose
	if [ "${verbosity}" -ge 2 ]; then
	    set -x
	fi

	n_copy_ok=$(gxpc --timeout ${timeout} mw --master blkcpm "blkcpc --pkt_sz ${pkt_sz} --creat ${creat} --trunc ${trunc} --read_fault ${read_fault} --send_fault ${send_fault} --recv_fault ${recv_fault} --write_fault ${write_fault} --close_fault ${close_fault} --closesocket_fault ${closesocket_fault} --block_on_fault ${block_on_fault} --min_receivers 0 --write ${do_write} --transfer ${do_trans} --range ${from_offs}:${to_offs} ${src_file} ${dst_file}" | grep DONE | wc -l)
	# tracing is switched off unconditionally, i.e. also when it
	# was not switched on above
	set +x

	n_failed=$((${n_disk_ok} - ${n_copy_ok}))
	if [ "${n_failed}" = "0" ]; then
	    if [ $i -gt 0 ] ; then
		echo "outbreak: all ${n_copy_ok} processes succeeded"
	    fi
	    break
	else 
	    echo "outbreak: ****** only ${n_copy_ok} processes succeeded (out of ${n_disk_ok})"
	fi
    done
    # get the set of clients that have succeeded
    gxpc savemask after
    # update checkpoint files of those who have succeeded
    # NOTE(review): this uses the global ${b} set by the caller,
    # not the argument $4 -- they are the same when called from main.
    gxpc -G ${src_gupid} export outbreak_offset=${b}

    # modify checkpoint file: a line "ip|realpath|mtime" describing
    # ${dst_file} is printed through gxpc and piped to
    # checkpoint.py update.
    # NOTE(review): the python snippet uses the python 2 print
    # statement, which python 3 rejects -- confirm which
    # interpreter "python" is where this runs.
    gxpc -G ${src_gupid} e "echo \$(python -c 'import os,sys,stat; ip=sys.argv[1]; f=sys.argv[2]; print \"%s|%s|%f\" % (ip, os.path.realpath(f), os.stat(f).st_mtime)' \"\$(hostname -I)\" ${dst_file})" | ${this_dir}/checkpoint.py update ${ckpt_db} ${b}

    # now, try to change status of the guys who have failed this time.
    # to this end, we restore the set of clients who were engaged
    # before this turn.
    gxpc restoremask before
    # execute "./cs fail" where outbreak_offset is still less
    # than ${b}, i.e. where the export above did not take effect.
    gxpc -G ${src_gupid} --timeout 2.0 e "[ \${outbreak_offset} -lt ${b} ] && echo outbreak : \${GXP_GUPID} : \$(hostname) \(\$(hostname -I)\) drop out due to an error && ./cs fail"
    # we are all set. get back to the set that has succeeded
    gxpc restoremask after
}





# main ARGS...
#
# the entry point: parse the command line, bring up gxp daemons on
# the source and the clients, then copy ${src_file} to ${dst_file}
# block by block (blk_sz bytes per copy_block call) until the whole
# size has been covered or check_continue says we should bail out.
#
# globals read    : the option variables set by parse_args
#                   (long_help, help, verbosity, src_file, dst_file,
#                   run, ask, wallpaper), blk_sz, and start_offset
#                   (set by set_start_offset)
# globals written : sz, src_gupid, n_sources, n_clients, reply,
#                   start_time, now, sec_since_start, msec, a, b, nc
# returns         : 1 if parsing the arguments, getting the size,
#                   copy_stuff or check_continue fails; 0 after
#                   printing help; otherwise the status of the last
#                   gxpc command.
#                   a dry run or a negative answer to the
#                   confirmation EXITS the script with 0.
main() {
    # "$@" (quoted) so that arguments containing blanks survive
    if ! parse_args "$@"; then
        usage
        return 1
    fi
    if [ "${long_help}" = 1 ]; then
        usage
        long_usage
        return 0
    fi
    if [ "${help}" = 1 ]; then
        usage
        return 0
    fi
    if [ "${verbosity}" -ge 3 ]; then
        print_args
    fi

    # get the size of the source file/disk
    sz=$(get_file_size "${src_file}")
    if [ "${sz}" = "" ]; then return 1; fi
    if [ "${verbosity}" -ge 2 ]; then
        echo "outbreak: src=${src_file}, dst=${dst_file}, size=${sz}."
    fi

    # clean up daemons left from the previous run
    gxpc quit

    # bring up the local daemon and get its name
    src_gupid=$(gxpc e echo '${GXP_GUPID}')
    if [ "${verbosity}" -ge 2 ]; then
        echo "outbreak: source gupid = ${src_gupid}"
    fi

    # get the number of sources; it SHOULD BE one (0 or 1).
    # the value is only recorded here, not checked.
    n_sources=$(gxpc e hostname | wc -l)

    # bring up clients and get the number of them.
    # they also go to their working dir
    bring_up_clients_and_change_dir

    # count the number of clients (source excluded with -G)
    n_clients=$(gxpc -G "${src_gupid}" e hostname | wc -l)

    # if --resume is given, check checkpoint files
    # to determine the starting offset.
    # variable start_offset is set
    set_start_offset
    if [ "${verbosity}" -ge 1 ]; then
        echo "outbreak: start from offset ${start_offset}"
    fi

    # dry run, if -n or --dryrun are given
    if [ "${run}" = 0 ]; then exit 0; fi

    # ask if we should go ahead, unless -f or --force are given
    if [ "${ask}" = 1 ]; then
        echo "should I go ahead [Y/n]?" 1>&2
        # read leaves an empty string in reply when it hits EOF
        # with no input, and an empty reply means "yes" below.
        # do not let a closed stdin silently confirm the copy.
        if ! read -r reply && [ -z "${reply}" ]; then
            reply=n
        fi
        case "${reply}" in
            y | Y | "") ;;
            *)
                echo "that's fine, see you later"
                exit 0
                ;;
        esac
    fi

    if [ "${wallpaper}" = 1 ]; then
        # copy the wallpaper changer and stuff to clients
        if ! copy_stuff; then return 1; fi
        if [ "$(whoami)" = "root" ]; then
            gxpc -G "${src_gupid}" e sudo -u ubuntu sh -c '"DISPLAY=:0 xhost +"'
        fi
        # change clients' wallpapers to the default one
        if [ "${verbosity}" -ge 1 ]; then
            echo "outbreak: reset client wallpapers to start"
        fi
        gxpc -G "${src_gupid}" e ./cs start
    fi

    # keep track of time, in nanoseconds since epoch
    start_time=$(date +%s%N)

    a=${start_offset}
    # now the real work begins!
    while [ "${a}" -lt "${sz}" ]; do
        # show current time and seconds since start
        now=$(date +%s%N)
        sec_since_start=$(((${now} - ${start_time}) / 1000000000))
        msec=$(((${now} - ${start_time}) % 1000000000 / 1000000))
        # the millisecond part must be zero-padded to three digits;
        # otherwise 1 sec + 5 msec would be shown as "1.5"
        printf 'outbreak: %s (%d.%03d sec since start)\n' \
            "$(date +%H:%M:%S)" "${sec_since_start}" "${msec}"
        # judge if we should continue or bail out
        if ! check_continue "${a}" "${sz}" "${n_clients}"; then
            if [ "${wallpaper}" = 1 ]; then
                gxpc -G "${src_gupid}" --timeout 2.0 e ./cs bailout
            fi
            return 1 # NG
        fi
        # real work. copy bytes a..b of the file
        b=$((${a} + ${blk_sz}))
        if [ "${b}" -gt "${sz}" ]; then b=${sz}; fi

        copy_block "${src_file}" "${dst_file}" "${a}" "${b}"
        a=${b}
    done
    nc=$(gxpc -G "${src_gupid}" ping | wc -l)
    gxpc smask
    # congrat. if you have gone thus far, you succeeded.
    # change wallpaper
    if [ "${wallpaper}" = 1 ]; then
        if [ "${verbosity}" -ge 1 ]; then
            echo "outbreak: set wallpaper to success"
        fi
        gxpc -G "${src_gupid}" e ./cs success
    fi
    echo "outbreak: ${nc} clients succeeded:"
    gxpc e echo "\$(hostname) \(\$(hostname -I)\) \${GXP_GUPID}"
}

# run the program; "$@" keeps each command line argument intact
# (an unquoted $@ would re-split arguments containing blanks).
main "$@"
